/**
 ******************************************************************************
 * Xenia : Xbox 360 Emulator Research Project                                 *
 ******************************************************************************
 * Copyright 2021 Ben Vanik. All rights reserved.                             *
 * Released under the BSD license - see LICENSE in the root for more details. *
 ******************************************************************************
 */

#include "xenia/gpu/primitive_processor.h"

#include <algorithm>
#include <cstring>
#include <functional>
#include <utility>

#include "xenia/base/assert.h"
#include "xenia/base/byte_order.h"
#include "xenia/base/cvar.h"
#include "xenia/base/logging.h"
#include "xenia/base/math.h"
#include "xenia/base/profiling.h"
#include "xenia/gpu/register_file.h"
#include "xenia/gpu/registers.h"
#include "xenia/gpu/shader.h"
#include "xenia/gpu/trace_writer.h"
#include "xenia/gpu/xenos.h"

// All these overrides are always safe to use as all backends are expected to
// support triangle lists and line strips.
//
// Read once in InitializeCommon: triangle fans are converted to triangle lists
// when the host lacks native fan support or when this flag is set.
DEFINE_bool(
    force_convert_triangle_fans_to_lists, false,
    "For host graphics API downlevel support testing only, force CPU "
    "conversion of triangle fans to triangle lists even if the host supports "
    "triangle fan primitives natively.",
    "GPU");
// Read once in InitializeCommon: line loops are converted to line strips when
// the host lacks native line loop support or when this flag is set.
DEFINE_bool(
    force_convert_line_loops_to_strips, false,
    "For host graphics API downlevel support testing only, force CPU "
    "conversion of line loops to line strips even if the host supports line "
    "loop primitives natively.",
    "GPU");
// Read once in InitializeCommon: quad lists are converted to triangle lists
// (the host primitive type becomes kTriangleList) when the host lacks quad list
// support or when this flag is set. The description must name triangle lists as
// the conversion target, matching the cvar name and the conversion code.
DEFINE_bool(
    force_convert_quad_lists_to_triangle_lists, false,
    "For host graphics API downlevel support testing only, force CPU "
    "conversion of quad lists to triangle lists even if the host supports "
    "quad list primitives natively or via geometry shader emulation.\n"
    "May also be useful for graphics debugging when the debugger doesn't "
    "display the geometry generated by geometry shaders properly.",
    "GPU");
// NOTE(review): InitializeCommon copies its full_32bit_vertex_indices_supported
// argument into full_32bit_vertex_indices_used_ without consulting this flag -
// confirm where (backend or later processing code) the override is applied.
DEFINE_bool(
    ignore_32bit_vertex_index_support, false,
    "For host graphics API downlevel testing only (useful only for Qualcomm "
    "Adreno 4xx-level host GPU testing), force indirection or pre-masking and "
    "pre-swapping of 32-bit vertex indices as if the host only supports 24-bit "
    "indices.",
    "GPU");
// TODO(Triang3l): More investigation of the cache threshold as cache lookups
// and insertions require global critical region locking, and insertions also
// require protecting pages. At 1024, the cache only made the performance worse
// (415607D4, 16-bit primitive reset index replacement).
//
// Signed on purpose: per the description below, a negative value is the way to
// turn the cache off entirely.
DEFINE_int32(
    primitive_processor_cache_min_indices, 4096,
    "Smallest number of guest indices to store in the cache to try reusing "
    "later in the same frame if processing (such as primitive type conversion "
    "or reset index replacement) is performed.\n"
    "Setting this to a very high value may result in excessive CPU processing, "
    "while a very low value may result in excessive locking and lookups.\n"
    "Negative values disable caching.",
    "GPU");

namespace xe {
namespace gpu {

// SIMD processing here assumes that alignment is not required (neither AVX nor
// Neon requires it) and there's no punishment for using an unaligned access
// instruction when the data is actually aligned (AVX has separate aligned /
// unaligned movs, but they have the same performance nowadays; Neon dropped the
// alignment specifier in AArch64), but truly unaligned access may result in two
// hardware memory operations if some boundary that is >= vector size is
// crossed.
//
// Therefore, to minimize unaligned access (primarily reads - since we depend on
// the data immediately), SIMD usage here is performed according to the
// following pattern (though we try to co-align the destination and the source
// prior to calling, but still doing all the operations for more code
// correctness and fewer unobvious conditions):
// - Until the source pointer is vector-aligned, process the first indices
//   without SIMD.
//   - The best possible outcome of this is that both the source and the
//     destination will be vector-aligned (if they were co-aligned prior to the
//     call), in this case, neither load nor store instructions will be crossing
//     cache lines.
//   - The other possible outcome is that the source will be aligned (1 memory
//     read per load), while the destination will be unaligned (1-2 memory
//     writes per store).
// - Process whole vectors with SIMD.
// - If fewer elements remain than a vector can hold, process them without
//   SIMD.
//
// We assume that indices are at least aligned to their natural alignment (2 or
// 4 bytes depending on the format) - the R6xx documentation says that in
// DRAW_INDEX, INDEX_BASE_LO is word-aligned, and that's required by host
// graphics APIs.

// Destruction goes through ShutdownCommon, the same cleanup path that
// InitializeCommon takes when it fails.
PrimitiveProcessor::~PrimitiveProcessor() { ShutdownCommon(); }

bool PrimitiveProcessor::InitializeCommon(
    bool full_32bit_vertex_indices_supported, bool triangle_fans_supported,
    bool line_loops_supported, bool quad_lists_supported,
    bool point_sprites_supported_without_vs_expansion,
    bool rectangle_lists_supported_without_vs_expansion) {
  full_32bit_vertex_indices_used_ = full_32bit_vertex_indices_supported;
  convert_triangle_fans_to_lists_ =
      !triangle_fans_supported || cvars::force_convert_triangle_fans_to_lists;
  convert_line_loops_to_strips_ =
      !line_loops_supported || cvars::force_convert_line_loops_to_strips;
  convert_quad_lists_to_triangle_lists_ =
      !quad_lists_supported ||
      cvars::force_convert_quad_lists_to_triangle_lists;
  // No override cvars as hosts are not required to support the fallback paths
  // since they require different vertex shader structure (for the fallback
  // HostVertexShaderTypes).
  expand_point_sprites_in_vs_ = !point_sprites_supported_without_vs_expansion;
  expand_rectangle_lists_in_vs_ =
      !rectangle_lists_supported_without_vs_expansion;

  // Initialize the index buffer for conversion of auto-indexed primitive types.
  size_t builtin_index_buffer_size = 0;
  // 32-bit, before 16-bit due to alignment (for primitive expansion - when the
  // indices encode not only the guest vertex index, but also a part needed for
  // host expansion, thus may contain values above UINT16_MAX, such as up to
  // (UINT16_MAX - 1) * 4 + 3 for point sprites).
  // Using an index buffer for point sprite and rectangle list expansion instead
  // of instancing as how instancing is implemented may vary wildly between
  // GPUs, potentially slowly (like no different instances in the same
  // wavefront) with small vertex counts per instance. Also using triangle
  // strips with primitive restart, not triangle lists, so the vertex shader may
  // be invoked once for the inner edge vertices, which is important for memory
  // export in guest shaders, not to write to the same location from two
  // invocations.
  uint32_t builtin_ib_two_triangle_strip_count = 0;
  if (expand_point_sprites_in_vs_) {
    builtin_ib_two_triangle_strip_count =
        std::max(uint32_t(UINT16_MAX), builtin_ib_two_triangle_strip_count);
  }
  if (expand_rectangle_lists_in_vs_) {
    builtin_ib_two_triangle_strip_count =
        std::max(uint32_t(UINT16_MAX / 3), builtin_ib_two_triangle_strip_count);
  }
  if (builtin_ib_two_triangle_strip_count) {
    builtin_ib_offset_two_triangle_strips_ = builtin_index_buffer_size;
    builtin_index_buffer_size +=
        sizeof(uint32_t) *
        GetTwoTriangleStripIndexCount(builtin_ib_two_triangle_strip_count);
  } else {
    builtin_ib_offset_two_triangle_strips_ = SIZE_MAX;
  }
  // 16-bit (for indirection on top of single auto-indexed vertices) - enough
  // even if the backend has primitive reset enabled all the time (Metal) as
  // auto-indexed draws are limited to UINT16_MAX vertices, not UINT16_MAX + 1.
  if (convert_triangle_fans_to_lists_) {
    builtin_ib_offset_triangle_fans_to_lists_ = builtin_index_buffer_size;
    builtin_index_buffer_size +=
        sizeof(uint16_t) * GetTriangleFanListIndexCount(UINT16_MAX);
  } else {
    builtin_ib_offset_triangle_fans_to_lists_ = SIZE_MAX;
  }
  if (convert_quad_lists_to_triangle_lists_) {
    builtin_ib_offset_quad_lists_to_triangle_lists_ = builtin_index_buffer_size;
    builtin_index_buffer_size +=
        sizeof(uint16_t) * GetQuadListTriangleListIndexCount(UINT16_MAX);
  } else {
    builtin_ib_offset_quad_lists_to_triangle_lists_ = SIZE_MAX;
  }
  if (builtin_index_buffer_size) {
    if (!InitializeBuiltinIndexBuffer(
            builtin_index_buffer_size,
            [this, builtin_ib_two_triangle_strip_count](void* mapping) {
              uint32_t* mapping_32bit = reinterpret_cast<uint32_t*>(mapping);
              if (builtin_ib_offset_two_triangle_strips_ != SIZE_MAX) {
                // Two-triangle strips.
                uint32_t* two_triangle_strip_ptr =
                    mapping_32bit +
                    builtin_ib_offset_two_triangle_strips_ / sizeof(uint32_t);
                for (uint32_t i = 0; i < builtin_ib_two_triangle_strip_count;
                     ++i) {
                  if (i) {
                    // Primitive restart.
                    *(two_triangle_strip_ptr++) = UINT32_MAX;
                  }
                  // Host vertex index within the pair in the lower 2 bits,
                  // guest primitive index in the rest.
                  uint32_t two_triangle_strip_first_index = i << 2;
                  for (uint32_t j = 0; j < 4; ++j) {
                    *(two_triangle_strip_ptr++) =
                        two_triangle_strip_first_index + j;
                  }
                }
              }
              uint16_t* mapping_16bit = reinterpret_cast<uint16_t*>(mapping);
              if (builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX) {
                // Triangle fans as triangle lists.
                // Ordered as (v1, v2, v0), (v2, v3, v0) in Direct3D.
                // https://docs.microsoft.com/en-us/windows/desktop/direct3d9/triangle-fans
                uint16_t* triangle_list_ptr =
                    mapping_16bit + builtin_ib_offset_triangle_fans_to_lists_ /
                                        sizeof(uint16_t);
                for (uint32_t i = 2; i < UINT16_MAX; ++i) {
                  *(triangle_list_ptr++) = uint16_t(i - 1);
                  *(triangle_list_ptr++) = uint16_t(i);
                  *(triangle_list_ptr++) = 0;
                }
              }
              if (builtin_ib_offset_quad_lists_to_triangle_lists_ != SIZE_MAX) {
                uint16_t* triangle_list_ptr =
                    mapping_16bit +
                    builtin_ib_offset_quad_lists_to_triangle_lists_ /
                        sizeof(uint16_t);
                // TODO(Triang3l): SIMD for faster initialization?
                for (uint32_t i = 0; i < UINT16_MAX / 4; ++i) {
                  uint16_t quad_first_index = uint16_t(i * 4);
                  // TODO(Triang3l): Find the correct order.
                  // v0, v1, v2.
                  *(triangle_list_ptr++) = quad_first_index;
                  *(triangle_list_ptr++) = quad_first_index + 1;
                  *(triangle_list_ptr++) = quad_first_index + 2;
                  // v0, v2, v3.
                  *(triangle_list_ptr++) = quad_first_index;
                  *(triangle_list_ptr++) = quad_first_index + 2;
                  *(triangle_list_ptr++) = quad_first_index + 3;
                }
              }
            })) {
      ShutdownCommon();
      return false;
    }
  }

  return true;
}

// Tears down the conversion cache and unregisters the physical memory
// invalidation callback. Does nothing when no callback handle is held, which,
// per the cache-usage convention in this file, means the cache has never been
// used.
void PrimitiveProcessor::ShutdownCommon() {
  if (!memory_invalidation_callback_handle_) {
    return;
  }

  // Reset the lookup structures while holding the global critical region; the
  // lock is released at the end of this scope, before unregistering.
  {
    auto global_lock = global_critical_region_.Acquire();
    cache_map_.clear();
    cache_bucket_free_first_entry_ = SIZE_MAX;
    std::memset(cache_buckets_non_empty_l2_, 0,
                sizeof(cache_buckets_non_empty_l2_));
    std::memset(cache_buckets_non_empty_l1_, 0,
                sizeof(cache_buckets_non_empty_l1_));
  }

  memory_.UnregisterPhysicalMemoryInvalidationCallback(
      memory_invalidation_callback_handle_);
  memory_invalidation_callback_handle_ = nullptr;

  // The entry storage itself is released only after the callback is gone.
  cache_entry_pool_.clear();
}

// Drops all cached conversion results while keeping their pool entries for
// reuse: every entry referenced by the map is pushed onto the free list, then
// the map and both levels of the non-empty bucket bitmaps are cleared.
// Does nothing if the invalidation callback handle is not set (the cache has
// never been used). Runs under the global critical region.
void PrimitiveProcessor::ClearPerFrameCache() {
  if (!memory_invalidation_callback_handle_) {
    // Only do clearing if cache has ever been used.
    return;
  }
  auto global_lock = global_critical_region_.Acquire();
  // Bind to the container's own element type. Spelling the element out as
  // std::pair<CacheKey, size_t> would not match a standard map's
  // std::pair<const CacheKey, size_t> value_type, silently materializing a
  // converted temporary copy on every iteration.
  for (const auto& cache_map_entry : cache_map_) {
    // Link the pool entry in front of the current free list head.
    cache_entry_pool_[cache_map_entry.second].free_next =
        cache_bucket_free_first_entry_;
    cache_bucket_free_first_entry_ = cache_map_entry.second;
  }
  cache_map_.clear();
  std::memset(cache_buckets_non_empty_l1_, 0,
              sizeof(cache_buckets_non_empty_l1_));
  std::memset(cache_buckets_non_empty_l2_, 0,
              sizeof(cache_buckets_non_empty_l2_));
}

bool PrimitiveProcessor::Process(ProcessingResult& result_out) {
  SCOPE_profile_cpu_f("gpu");

  const RegisterFile& regs = register_file_;
  auto vgt_draw_initiator = regs.Get<reg::VGT_DRAW_INITIATOR>();

  // Parse the primitive type and the tessellation state (VGT_OUTPUT_PATH_CNTL
  // is only used in the explicit major mode) - there are cases in games when
  // this register is left over after usage of tessellation in draws that don't
  // need it.
  xenos::PrimitiveType guest_primitive_type = vgt_draw_initiator.prim_type;
  xenos::PrimitiveType host_primitive_type = guest_primitive_type;
  bool tessellation_enabled =
      xenos::IsMajorModeExplicit(vgt_draw_initiator.major_mode,
                                 vgt_draw_initiator.prim_type) &&
      regs.Get<reg::VGT_OUTPUT_PATH_CNTL>().path_select ==
          xenos::VGTOutputPath::kTessellationEnable;
  xenos::TessellationMode tessellation_mode =
      regs.Get<reg::VGT_HOS_CNTL>().tess_mode;
  Shader::HostVertexShaderType host_vertex_shader_type;
  if (tessellation_enabled) {
    // Currently only supporting tessellation in known cases for safety, and not
    // yet converting patch strips / fans to patch lists until games using them
    // are found for easier debugging when it actually happens.
    // TODO(Triang3l): Conversion of patch strips / fans if found.
    host_vertex_shader_type = Shader::HostVertexShaderType(-1);
    switch (guest_primitive_type) {
      case xenos::PrimitiveType::kTriangleList:
        // Also supported by triangle strips and fans according to:
        // https://www.khronos.org/registry/OpenGL/extensions/AMD/AMD_vertex_shader_tessellator.txt
        // Would need to convert those to triangle lists, but haven't seen any
        // games using tessellated strips / fans so far.
        switch (tessellation_mode) {
          case xenos::TessellationMode::kDiscrete:
            // - 415607E1 - nets above barrels in the beginning of the first
            //   mission (turn right after the end of the intro) -
            //   kTriangleList.
            host_vertex_shader_type =
                Shader::HostVertexShaderType::kTriangleDomainCPIndexed;
            break;
          case xenos::TessellationMode::kContinuous:
            // - 4D5307F2 - tree building with a beehive in the beginning
            //   (visible on the start screen behind the logo), waterfall in the
            //   beginning - kTriangleList.
            host_vertex_shader_type =
                Shader::HostVertexShaderType::kTriangleDomainCPIndexed;
            break;
          default:
            break;
        }
        break;
      case xenos::PrimitiveType::kQuadList:
        switch (tessellation_mode) {
          // Also supported by quad strips according to:
          // https://www.khronos.org/registry/OpenGL/extensions/AMD/AMD_vertex_shader_tessellator.txt
          // Would need to convert those to quad lists, but haven't seen any
          // games using tessellated strips so far.
          case xenos::TessellationMode::kDiscrete:
            // Not seen in games so far.
            host_vertex_shader_type =
                Shader::HostVertexShaderType::kQuadDomainCPIndexed;
            break;
          case xenos::TessellationMode::kContinuous:
            // - 58410823 - retro screen and beams in the main menu - kQuadList.
            host_vertex_shader_type =
                Shader::HostVertexShaderType::kQuadDomainCPIndexed;
            break;
          default:
            break;
        }
        break;
      case xenos::PrimitiveType::kTrianglePatch:
        // - 4D5307E6 - water - adaptive.
        // - 4D5307ED - water - adaptive.
        host_vertex_shader_type =
            Shader::HostVertexShaderType::kTriangleDomainPatchIndexed;
        break;
      case xenos::PrimitiveType::kQuadPatch:
        // - 4D5307F1 - ground - continuous.
        // - 4D5307F2 - garden ground - adaptive.
        host_vertex_shader_type =
            Shader::HostVertexShaderType::kQuadDomainPatchIndexed;
        break;
      default:
        // TODO(Triang3l): Support line patches.
        break;
    }
    if (host_vertex_shader_type == Shader::HostVertexShaderType(-1)) {
      XELOGE(
          "Primitive processor: Unsupported tessellation mode {} for primitive "
          "type {}. Report the game to Xenia developers!",
          uint32_t(tessellation_mode), uint32_t(guest_primitive_type));
      assert_always();
      return false;
    }
  } else {
    host_vertex_shader_type = Shader::HostVertexShaderType::kVertex;
    switch (guest_primitive_type) {
      case xenos::PrimitiveType::kPointList:
        if (expand_point_sprites_in_vs_) {
          host_primitive_type = xenos::PrimitiveType::kTriangleStrip;
          host_vertex_shader_type =
              Shader::HostVertexShaderType::kPointListAsTriangleStrip;
        }
        break;
      case xenos::PrimitiveType::kLineList:
      case xenos::PrimitiveType::kLineStrip:
      case xenos::PrimitiveType::kTriangleList:
      case xenos::PrimitiveType::kTriangleStrip:
        // Supported natively on all backends.
        break;
      case xenos::PrimitiveType::kRectangleList:
        if (expand_rectangle_lists_in_vs_) {
          host_primitive_type = xenos::PrimitiveType::kTriangleStrip;
          host_vertex_shader_type =
              Shader::HostVertexShaderType::kRectangleListAsTriangleStrip;
        }
        break;
      case xenos::PrimitiveType::kTriangleFan:
        if (convert_triangle_fans_to_lists_) {
          host_primitive_type = xenos::PrimitiveType::kTriangleList;
        }
        break;
      case xenos::PrimitiveType::kLineLoop:
        if (convert_line_loops_to_strips_) {
          host_primitive_type = xenos::PrimitiveType::kLineStrip;
        }
        break;
      case xenos::PrimitiveType::kQuadList:
        if (convert_quad_lists_to_triangle_lists_) {
          host_primitive_type = xenos::PrimitiveType::kTriangleList;
        }
        break;
      default:
        XELOGE(
            "Primitive processor: Unsupported primitive type {}. Report the "
            "game to Xenia developers!",
            uint32_t(guest_primitive_type));
        assert_always();
        return false;
    }
  }

  // Process the indices.
  uint32_t guest_draw_vertex_count = vgt_draw_initiator.num_indices;
  auto vgt_dma_size = regs.Get<reg::VGT_DMA_SIZE>();
  if (vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA &&
      guest_draw_vertex_count > vgt_dma_size.num_words) {
    XELOGW(
        "Primitive processor: {} vertices attempted to be drawn with an index "
        "buffer only containing {}. Should be fetching zero indices instead of "
        "overflowing ones, but this is a rare situation, so not handled yet. "
        "Report the game to Xenia developers!",
        guest_draw_vertex_count, vgt_dma_size.num_words);
    guest_draw_vertex_count = vgt_dma_size.num_words;
  }
  uint32_t line_loop_closing_index = 0;
  uint32_t guest_index_base = 0, guest_index_buffer_needed_bytes = 0;
  CachedResult cacheable;
  cacheable.host_draw_vertex_count = guest_draw_vertex_count;
  cacheable.host_primitive_reset_enabled = false;
  cacheable.host_index_buffer_handle = SIZE_MAX;
  if (host_vertex_shader_type ==
          Shader::HostVertexShaderType::kPointListAsTriangleStrip ||
      host_vertex_shader_type ==
          Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) {
    // As two-triangle strips, with guest indices being either autogenerated or
    // fetched via DMA.
    uint32_t primitive_count = guest_draw_vertex_count;
    if (host_vertex_shader_type ==
        Shader::HostVertexShaderType::kRectangleListAsTriangleStrip) {
      primitive_count /= 3;
    }
    cacheable.host_draw_vertex_count =
        GetTwoTriangleStripIndexCount(primitive_count);
    cacheable.host_index_format = xenos::IndexFormat::kInt32;
    cacheable.host_primitive_reset_enabled = true;
    assert_true(builtin_ib_offset_two_triangle_strips_ != SIZE_MAX);
    cacheable.host_index_buffer_handle = builtin_ib_offset_two_triangle_strips_;
    if (vgt_draw_initiator.source_select == xenos::SourceSelect::kAutoIndex) {
      cacheable.index_buffer_type =
          ProcessedIndexBufferType::kHostBuiltinForAuto;
      cacheable.host_shader_index_endian = xenos::Endian::kNone;
    } else {
      // There is an index buffer.
      assert_true(vgt_draw_initiator.source_select ==
                  xenos::SourceSelect::kDMA);
      if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA) {
        // TODO(Triang3l): Support immediate-indexed vertices.
        XELOGE(
            "Primitive processor: Unsupported vertex index source {}. Report "
            "the game to Xenia developers!",
            uint32_t(vgt_draw_initiator.source_select));
        return false;
      }
      xenos::IndexFormat guest_index_format = vgt_draw_initiator.index_size;
      // Normalize the endian.
      cacheable.index_buffer_type =
          ProcessedIndexBufferType::kHostBuiltinForDMA;
      xenos::Endian guest_index_endian = vgt_dma_size.swap_mode;
      if (guest_index_format == xenos::IndexFormat::kInt16 &&
          (guest_index_endian != xenos::Endian::kNone &&
           guest_index_endian != xenos::Endian::k8in16)) {
        XELOGW(
            "Primitive processor: 32-bit endian swap mode {} is used for "
            "16-bit indices. This shouldn't normally be happening, but report "
            "the game to Xenia developers for investigation of the intended "
            "behavior (ignore or actually swap across adjacent indices)! "
            "Currently disabling the swap for 16-and-32 and replacing 8-in-32 "
            "with 8-in-16.",
            uint32_t(guest_index_endian));
        guest_index_endian = guest_index_endian == xenos::Endian::k8in32
                                 ? xenos::Endian::k8in16
                                 : xenos::Endian::kNone;
      }
      cacheable.host_shader_index_endian = guest_index_endian;
      // Get the index buffer memory range.
      uint32_t index_size_log2 =
          guest_index_format == xenos::IndexFormat::kInt16 ? 1 : 2;
      // The base should already be aligned, but aligning here too for safety.
      guest_index_base =
          regs[XE_GPU_REG_VGT_DMA_BASE] & ~uint32_t((1 << index_size_log2) - 1);
      guest_index_buffer_needed_bytes = guest_draw_vertex_count
                                        << index_size_log2;
      if (guest_index_base > SharedMemory::kBufferSize ||
          SharedMemory::kBufferSize - guest_index_base <
              guest_index_buffer_needed_bytes) {
        XELOGE(
            "Primitive processor: Index buffer at 0x{:08X}, 0x{:X} bytes "
            "required, is out of the physical memory bounds",
            guest_index_base, guest_index_buffer_needed_bytes);
        assert_always();
        return false;
      }
    }
  } else if (vgt_draw_initiator.source_select ==
             xenos::SourceSelect::kAutoIndex) {
    // Auto-indexed - use a remapping index buffer if needed to change the
    // primitive type.
    if (tessellation_enabled &&
        tessellation_mode == xenos::TessellationMode::kAdaptive) {
      XELOGE(
          "Primitive processor: Adaptive tessellation requires 32-bit "
          "floating-point edge tessellation factors in the index buffer, but "
          "no index buffer is provided by the guest.");
      assert_always();
      return false;
    }
    cacheable.host_index_format = xenos::IndexFormat::kInt16;
    cacheable.host_shader_index_endian = xenos::Endian::kNone;
    cacheable.host_primitive_reset_enabled = false;
    cacheable.index_buffer_type = ProcessedIndexBufferType::kNone;
    if (host_primitive_type != guest_primitive_type) {
      switch (guest_primitive_type) {
        case xenos::PrimitiveType::kTriangleFan:
          assert_true(host_primitive_type ==
                      xenos::PrimitiveType::kTriangleList);
          cacheable.host_draw_vertex_count =
              GetTriangleFanListIndexCount(cacheable.host_draw_vertex_count);
          cacheable.index_buffer_type =
              ProcessedIndexBufferType::kHostBuiltinForAuto;
          assert_true(builtin_ib_offset_triangle_fans_to_lists_ != SIZE_MAX);
          cacheable.host_index_buffer_handle =
              builtin_ib_offset_triangle_fans_to_lists_;
          break;
        case xenos::PrimitiveType::kLineLoop:
          // Plus 1 element (if there's anything to draw) in the strip, still
          // auto-indexed, but the added excess index should be treated as 0 by
          // the vertex shaders.
          assert_true(host_primitive_type == xenos::PrimitiveType::kLineStrip);
          cacheable.host_draw_vertex_count =
              GetLineLoopStripIndexCount(cacheable.host_draw_vertex_count);
          if (cacheable.host_draw_vertex_count) {
            line_loop_closing_index = cacheable.host_draw_vertex_count - 1;
          }
          break;
        case xenos::PrimitiveType::kQuadList:
          assert_true(host_primitive_type ==
                      xenos::PrimitiveType::kTriangleList);
          cacheable.host_draw_vertex_count = GetQuadListTriangleListIndexCount(
              cacheable.host_draw_vertex_count);
          cacheable.index_buffer_type =
              ProcessedIndexBufferType::kHostBuiltinForAuto;
          assert_true(builtin_ib_offset_quad_lists_to_triangle_lists_ !=
                      SIZE_MAX);
          cacheable.host_index_buffer_handle =
              builtin_ib_offset_quad_lists_to_triangle_lists_;
          break;
        default:
          assert_always();
          return false;
      }
    }
  } else {
    // There is an index buffer.
    assert_true(vgt_draw_initiator.source_select == xenos::SourceSelect::kDMA);
    if (vgt_draw_initiator.source_select != xenos::SourceSelect::kDMA) {
      // TODO(Triang3l): Support immediate-indexed vertices.
      XELOGE(
          "Primitive processor: Unsupported vertex index source {}. Report the "
          "game to Xenia developers!",
          uint32_t(vgt_draw_initiator.source_select));
      return false;
    }
    xenos::IndexFormat guest_index_format = vgt_draw_initiator.index_size;
    cacheable.host_index_format = guest_index_format;
    // Normalize the endian and the reset index.
    xenos::Endian guest_index_endian = vgt_dma_size.swap_mode;
    if (guest_index_format == xenos::IndexFormat::kInt16 &&
        (guest_index_endian != xenos::Endian::kNone &&
         guest_index_endian != xenos::Endian::k8in16)) {
      XELOGW(
          "Primitive processor: 32-bit endian swap mode {} is used for 16-bit "
          "indices. This shouldn't normally be happening, but report the game "
          "to Xenia developers for investigation of the intended behavior "
          "(ignore or actually swap across adjacent indices)! Currently "
          "disabling the swap for 16-and-32 and replacing 8-in-32 with "
          "8-in-16.",
          uint32_t(guest_index_endian));
      guest_index_endian = guest_index_endian == xenos::Endian::k8in32
                               ? xenos::Endian::k8in16
                               : xenos::Endian::kNone;
    }
    bool guest_primitive_reset_enabled = false;
    uint32_t guest_primitive_reset_index_guest_endian = 0;
    if (tessellation_enabled &&
        tessellation_mode == xenos::TessellationMode::kAdaptive) {
      // Adaptive tessellation uses the index buffer not for indices, but for
      // 32-bit floating-point edge factors - no primitive reset.
      if (guest_index_format != xenos::IndexFormat::kInt32) {
        XELOGE(
            "Primitive processor: Adaptive tessellation requires 32-bit "
            "floating-point edge tessellation factors in the index buffer, but "
            "16-bit index buffer is provided by the guest.");
        assert_always();
        return false;
      }
    } else {
      if (regs.Get<reg::PA_SU_SC_MODE_CNTL>().multi_prim_ib_ena) {
        switch (guest_primitive_type) {
          case xenos::PrimitiveType::kLineStrip:
          case xenos::PrimitiveType::kTriangleFan:
          case xenos::PrimitiveType::kTriangleStrip:
          case xenos::PrimitiveType::kLineLoop:
          case xenos::PrimitiveType::kQuadStrip:
          case xenos::PrimitiveType::kPolygon:
          case xenos::PrimitiveType::k2DLineStrip:
          case xenos::PrimitiveType::k2DTriStrip:
            guest_primitive_reset_index_guest_endian = xenos::GpuSwap(
                regs.Get<reg::VGT_MULTI_PRIM_IB_RESET_INDX>().reset_indx,
                guest_index_endian);
            // - VGT, what does the guest say about its primitive reset index?
            // - It's over 0xFFFF!!!
            // - What!? 0xFFFF!? There's no way that can be stored in 16 bits!
            guest_primitive_reset_enabled =
                guest_index_format == xenos::IndexFormat::kInt16
                    ? guest_primitive_reset_index_guest_endian <= UINT16_MAX
                    : true;
            break;
          default:
            // Vulkan explicitly disallows primitive restart index for "list"
            // topologies. In Direct3D 12, it's valid for non-strips, but has
            // implementation-defined behavior. Make backend usage simpler by
            // explicitly filtering lists out, and hope the guest never uses
            // primitive reset for lists.
            break;
        }
      }
    }

    // Get the index buffer memory range.
    uint32_t index_size_log2 =
        guest_index_format == xenos::IndexFormat::kInt16 ? 1 : 2;
    // The base should already be aligned, but aligning here too for safety.
    guest_index_base =
        regs[XE_GPU_REG_VGT_DMA_BASE] & ~uint32_t((1 << index_size_log2) - 1);
    guest_index_buffer_needed_bytes = guest_draw_vertex_count
                                      << index_size_log2;
    if (guest_index_base > SharedMemory::kBufferSize ||
        SharedMemory::kBufferSize - guest_index_base <
            guest_index_buffer_needed_bytes) {
      XELOGE(
          "Primitive processor: Index buffer at 0x{:08X}, 0x{:X} bytes "
          "required, is out of the physical memory bounds",
          guest_index_base, guest_index_buffer_needed_bytes);
      assert_always();
      return false;
    }

    cacheable.host_index_format = guest_index_format;
    cacheable.host_shader_index_endian = guest_index_endian;
    uint32_t guest_index_mask_guest_endian =
        guest_index_format == xenos::IndexFormat::kInt16
            ? UINT16_MAX
            : GpuSwap(xenos::kVertexIndexMask, guest_index_endian);
    if (host_primitive_type != guest_primitive_type) {
      // Already converting to a different index type - primitive reset is
      // performed during conversion here. Also doing the endian swap here for
      // hosts not supporting 32-bit indices because indirection is only used
      // for the shared memory buffer.
      // Writing to the trace irrespective of the cache lookup result because
      // cache behavior depends on runtime configuration and state.
      trace_writer_.WriteMemoryRead(guest_index_base,
                                    guest_index_buffer_needed_bytes);
      CacheTransaction cache_transaction(
          *this, CacheKey(guest_index_base, guest_draw_vertex_count,
                          guest_index_format, guest_index_endian,
                          guest_primitive_reset_enabled, guest_primitive_type));
      if (cache_transaction.GetFoundResult()) {
        cacheable = *cache_transaction.GetFoundResult();
      } else {
        const void* guest_indices_ptr =
            memory_.TranslatePhysical(guest_index_base);
        cacheable.index_buffer_type = ProcessedIndexBufferType::kHostConverted;
        cacheable.host_primitive_reset_enabled = false;
        std::function<uint32_t(uint32_t)> host_index_count_getter;
        switch (guest_primitive_type) {
          case xenos::PrimitiveType::kTriangleFan:
            host_index_count_getter = GetTriangleFanListIndexCount;
            break;
          case xenos::PrimitiveType::kLineLoop:
            host_index_count_getter = GetLineLoopStripIndexCount;
            break;
          case xenos::PrimitiveType::kQuadList:
            host_index_count_getter = GetQuadListTriangleListIndexCount;
            break;
          default:
            assert_unhandled_case(guest_primitive_type);
            return false;
        }
        single_primitive_ranges_.clear();
        if (guest_index_format == xenos::IndexFormat::kInt16) {
          // 16-bit indices - just convert the primitive (or multiple
          // primitives) to the host topology.
          // TODO(Triang3l): 16-bit > 32-bit primitive type conversion for
          // Metal, where primitive reset is always enabled, if UINT16_MAX is
          // used as a real vertex index.
          auto guest_indices =
              reinterpret_cast<const uint16_t*>(guest_indices_ptr);
          if (guest_primitive_reset_enabled &&
              IsResetUsed(guest_indices, guest_draw_vertex_count,
                          guest_primitive_reset_index_guest_endian)) {
            // Multiple primitives in the index buffer - gather all single
            // primitives.
            cacheable.host_draw_vertex_count =
                GetMultiPrimitiveHostIndexCountAndRanges(
                    host_index_count_getter, guest_indices,
                    guest_draw_vertex_count,
                    guest_primitive_reset_index_guest_endian,
                    single_primitive_ranges_);
          } else {
            cacheable.host_draw_vertex_count =
                host_index_count_getter(guest_draw_vertex_count);
            single_primitive_ranges_.emplace_back(
                0, guest_draw_vertex_count, cacheable.host_draw_vertex_count);
          }
          auto host_indices = reinterpret_cast<uint16_t*>(
              RequestHostConvertedIndexBufferForCurrentFrame(
                  xenos::IndexFormat::kInt16, cacheable.host_draw_vertex_count,
                  false, guest_index_base, cacheable.host_index_buffer_handle));
          if (!host_indices) {
            return false;
          }
          ConvertSinglePrimitiveRanges(
              host_indices, guest_indices, guest_primitive_type,
              PassthroughIndexTransform(), single_primitive_ranges_.cbegin(),
              single_primitive_ranges_.cend());
        } else {
          // 32-bit indices - may need to pre-swap and pre-mask also if the host
          // doesn't support full 32-bit vertex indices.
          auto guest_indices =
              reinterpret_cast<const uint32_t*>(guest_indices_ptr);
          if (guest_primitive_reset_enabled &&
              IsResetUsed(guest_indices, guest_draw_vertex_count,
                          guest_primitive_reset_index_guest_endian,
                          guest_index_mask_guest_endian)) {
            // Multiple primitives in the index buffer - gather all single
            // primitives.
            cacheable.host_draw_vertex_count =
                GetMultiPrimitiveHostIndexCountAndRanges(
                    host_index_count_getter, guest_indices,
                    guest_draw_vertex_count,
                    guest_primitive_reset_index_guest_endian,
                    guest_index_mask_guest_endian, single_primitive_ranges_);
          } else {
            cacheable.host_draw_vertex_count =
                host_index_count_getter(guest_draw_vertex_count);
            single_primitive_ranges_.emplace_back(
                0, guest_draw_vertex_count, cacheable.host_draw_vertex_count);
          }
          auto host_indices = reinterpret_cast<uint32_t*>(
              RequestHostConvertedIndexBufferForCurrentFrame(
                  xenos::IndexFormat::kInt32, cacheable.host_draw_vertex_count,
                  false, guest_index_base, cacheable.host_index_buffer_handle));
          if (!host_indices) {
            return false;
          }
          auto single_primitive_ranges_beginning =
              single_primitive_ranges_.cbegin();
          auto single_primitive_ranges_end = single_primitive_ranges_.cend();
          if (full_32bit_vertex_indices_used_) {
            ConvertSinglePrimitiveRanges(
                host_indices, guest_indices, guest_primitive_type,
                PassthroughIndexTransform(), single_primitive_ranges_beginning,
                single_primitive_ranges_end);
          } else {
            switch (guest_index_endian) {
              case xenos::Endian::kNone:
                ConvertSinglePrimitiveRanges(host_indices, guest_indices,
                                             guest_primitive_type,
                                             To24NonSwappingIndexTransform(),
                                             single_primitive_ranges_beginning,
                                             single_primitive_ranges_end);
                break;
              case xenos::Endian::k8in16:
                ConvertSinglePrimitiveRanges(host_indices, guest_indices,
                                             guest_primitive_type,
                                             To24Swapping8In16IndexTransform(),
                                             single_primitive_ranges_beginning,
                                             single_primitive_ranges_end);
                break;
              case xenos::Endian::k8in32:
                ConvertSinglePrimitiveRanges(host_indices, guest_indices,
                                             guest_primitive_type,
                                             To24Swapping8In32IndexTransform(),
                                             single_primitive_ranges_beginning,
                                             single_primitive_ranges_end);
                break;
              case xenos::Endian::k16in32:
                ConvertSinglePrimitiveRanges(host_indices, guest_indices,
                                             guest_primitive_type,
                                             To24Swapping16In32IndexTransform(),
                                             single_primitive_ranges_beginning,
                                             single_primitive_ranges_end);
                break;
              default:
                assert_unhandled_case(guest_index_endian);
                return false;
            }
            cacheable.host_shader_index_endian = xenos::Endian::kNone;
          }
        }
        cache_transaction.SetNewResult(cacheable);
      }
    } else {
      // Using the same indices on the host as on the guest, either directly or
      // (for backends not supporting full 32-bit indices, thus unable to
      // endian-swap, or even to safely drop the upper 8 bits if no swap is even
      // needed) indirectly.
      cacheable.host_draw_vertex_count = guest_draw_vertex_count;
      cacheable.index_buffer_type = ProcessedIndexBufferType::kGuestDMA;
      cacheable.host_primitive_reset_enabled = guest_primitive_reset_enabled;
      if (guest_primitive_reset_enabled) {
        if (guest_index_format == xenos::IndexFormat::kInt16) {
          // The whole 16-bit index is compared to the primitive reset index.
          // Does not need indirection on backends not supporting full 32-bit
          // indices.
          if (guest_primitive_reset_index_guest_endian != UINT16_MAX) {
            // If primitive reset with a non-0xFFFF index is used, replace with
            // 0xFFFF if 0xFFFF is not used as a real index, or with 0xFFFFFFFF
            // if it is.
            // Writing to the trace irrespective of the cache lookup result
            // because cache behavior depends on runtime configuration and
            // state.
            // Example of 16-bit reset index replacement: 415607D4.
            trace_writer_.WriteMemoryRead(guest_index_base,
                                          guest_index_buffer_needed_bytes);
            // Not specifying the primitive type in the cache key because not
            // replacing it, only the reset index in a type-independent way.
            CacheTransaction cache_transaction(
                *this, CacheKey(guest_index_base, guest_draw_vertex_count,
                                guest_index_format, guest_index_endian,
                                guest_primitive_reset_enabled));
            if (cache_transaction.GetFoundResult()) {
              cacheable = *cache_transaction.GetFoundResult();
            } else {
              auto guest_indices =
                  memory_.TranslatePhysical<const uint16_t*>(guest_index_base);
              bool is_reset_index_used, is_ffff_used_as_vertex_index;
              Get16BitResetIndexUsage(guest_indices, guest_draw_vertex_count,
                                      guest_primitive_reset_index_guest_endian,
                                      is_reset_index_used,
                                      is_ffff_used_as_vertex_index);
              if (is_reset_index_used) {
                cacheable.index_buffer_type =
                    ProcessedIndexBufferType::kHostConverted;
                cacheable.host_index_format = is_ffff_used_as_vertex_index
                                                  ? xenos::IndexFormat::kInt32
                                                  : xenos::IndexFormat::kInt16;
                void* host_indices_ptr =
                    RequestHostConvertedIndexBufferForCurrentFrame(
                        cacheable.host_index_format, guest_draw_vertex_count,
                        true, guest_index_base,
                        cacheable.host_index_buffer_handle);
                if (!host_indices_ptr) {
                  return false;
                }
                if (is_ffff_used_as_vertex_index) {
                  ReplaceResetIndex16To24(
                      reinterpret_cast<uint32_t*>(host_indices_ptr),
                      guest_indices, guest_draw_vertex_count,
                      guest_primitive_reset_index_guest_endian);
                } else {
                  ReplaceResetIndex16To16(
                      reinterpret_cast<uint16_t*>(host_indices_ptr),
                      guest_indices, guest_draw_vertex_count,
                      guest_primitive_reset_index_guest_endian);
                }
              }
              cache_transaction.SetNewResult(cacheable);
            }
          }
        } else {
          // Low 24 bits of the guest index are compared to the primitive reset
          // index. If the backend doesn't support full 32-bit indices, for
          // ProcessedIndexBufferType::kGuestDMA, the host needs to read the
          // buffer indirectly in the vertex shaders and swap, and for
          // ProcessedIndexBufferType::kHostConverted (if primitive reset is
          // actually used, thus exactly 0xFFFFFFFF must be sent to the host for
          // it in a true index buffer), no indirection is done, but
          // pre-swapping and pre-masking is done here.
          // Writing to the trace irrespective of the cache lookup result
          // because cache behavior depends on runtime configuration and state.
          trace_writer_.WriteMemoryRead(guest_index_base,
                                        guest_index_buffer_needed_bytes);
          // Not specifying the primitive type in the cache key because not
          // replacing it, only the reset index in a type-independent way.
          CacheTransaction cache_transaction(
              *this, CacheKey(guest_index_base, guest_draw_vertex_count,
                              guest_index_format, guest_index_endian,
                              guest_primitive_reset_enabled));
          if (cache_transaction.GetFoundResult()) {
            cacheable = *cache_transaction.GetFoundResult();
          } else {
            auto guest_indices =
                memory_.TranslatePhysical<const uint32_t*>(guest_index_base);
            if (IsResetUsed(guest_indices, guest_draw_vertex_count,
                            guest_primitive_reset_index_guest_endian,
                            guest_index_mask_guest_endian)) {
              cacheable.index_buffer_type =
                  ProcessedIndexBufferType::kHostConverted;
              auto host_indices = reinterpret_cast<uint32_t*>(
                  RequestHostConvertedIndexBufferForCurrentFrame(
                      xenos::IndexFormat::kInt32, guest_draw_vertex_count, true,
                      guest_index_base, cacheable.host_index_buffer_handle));
              if (!host_indices) {
                return false;
              }
              if (full_32bit_vertex_indices_used_ ||
                  guest_index_endian == xenos::Endian::kNone) {
                ReplaceResetIndex32To24<xenos::Endian::kNone>(
                    host_indices, guest_indices, guest_draw_vertex_count,
                    guest_primitive_reset_index_guest_endian,
                    guest_index_mask_guest_endian);
              } else if (guest_index_endian == xenos::Endian::k8in16) {
                ReplaceResetIndex32To24<xenos::Endian::k8in16>(
                    host_indices, guest_indices, guest_draw_vertex_count,
                    guest_primitive_reset_index_guest_endian,
                    guest_index_mask_guest_endian);
              } else if (guest_index_endian == xenos::Endian::k8in32) {
                ReplaceResetIndex32To24<xenos::Endian::k8in32>(
                    host_indices, guest_indices, guest_draw_vertex_count,
                    guest_primitive_reset_index_guest_endian,
                    guest_index_mask_guest_endian);
              } else if (guest_index_endian == xenos::Endian::k16in32) {
                ReplaceResetIndex32To24<xenos::Endian::k16in32>(
                    host_indices, guest_indices, guest_draw_vertex_count,
                    guest_primitive_reset_index_guest_endian,
                    guest_index_mask_guest_endian);
              } else {
                assert_unhandled_case(guest_index_endian);
                return false;
              }
              cacheable.host_shader_index_endian =
                  full_32bit_vertex_indices_used_ ? guest_index_endian
                                                  : xenos::Endian::kNone;
            }
            cache_transaction.SetNewResult(cacheable);
          }
        }
      }
    }
  }

  // Request the indices in the shared memory if they need to be accessed from
  // there on the GPU.
  if (cacheable.index_buffer_type == ProcessedIndexBufferType::kGuestDMA ||
      cacheable.index_buffer_type ==
          ProcessedIndexBufferType::kHostBuiltinForDMA) {
    // Request the index buffer memory.
    // TODO(Triang3l): Shared memory request cache.
    if (!shared_memory_.RequestRange(guest_index_base,
                                     guest_index_buffer_needed_bytes)) {
      XELOGE(
          "PrimitiveProcessor: Failed to request index buffer 0x{:08X}, 0x{:X} "
          "bytes needed, in the shared memory",
          guest_index_base, guest_index_buffer_needed_bytes);
      return false;
    }
  }

  result_out.guest_primitive_type = guest_primitive_type;
  result_out.host_primitive_type = host_primitive_type;
  result_out.host_vertex_shader_type = host_vertex_shader_type;
  result_out.tessellation_mode = tessellation_mode;
  result_out.host_draw_vertex_count = cacheable.host_draw_vertex_count;
  result_out.line_loop_closing_index = line_loop_closing_index;
  result_out.index_buffer_type = cacheable.index_buffer_type;
  result_out.guest_index_base = guest_index_base;
  result_out.host_index_format = cacheable.host_index_format;
  result_out.host_shader_index_endian = cacheable.host_shader_index_endian;
  result_out.host_primitive_reset_enabled =
      cacheable.host_primitive_reset_enabled;
  result_out.host_index_buffer_handle = cacheable.host_index_buffer_handle;
  return true;
}

bool PrimitiveProcessor::IsResetUsed(const uint16_t* source, uint32_t count,
                                     uint16_t reset_index_guest_endian) {
  // Reports whether at least one of the `count` 16-bit indices starting at
  // `source` equals `reset_index_guest_endian`. The indices are compared as
  // stored, without any byte swapping, returning as soon as a match is found.
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar prologue - advance index by index until the source pointer is
  // aligned to the SIMD vector size, since the vector loop does aligned loads.
  for (; count && (reinterpret_cast<uintptr_t>(source) &
                   (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1));
       --count, ++source) {
    if (*source == reset_index_guest_endian) {
      return true;
    }
  }
  if (count >= kSimdVectorU16Elements) {
    SimdVectorU16 reset_simd = ReplicateU16(reset_index_guest_endian);
    do {
      SimdVectorU16 indices_simd = LoadAlignedVectorU16(source);
      source += kSimdVectorU16Elements;
      count -= kSimdVectorU16Elements;
#if XE_ARCH_AMD64
      // Any set bit in the byte mask means that some lane matched.
      if (_mm_movemask_epi8(_mm_cmpeq_epi16(indices_simd, reset_simd))) {
        return true;
      }
#elif XE_ARCH_ARM64
      // Narrowing the comparison result (viewed as two 64-bit lanes) to two
      // 32-bit lanes with saturation keeps non-zero lanes non-zero, so the
      // whole vector can be tested as a single 64-bit scalar.
      uint64x1_t any_match = vreinterpret_u64_u32(vqmovn_u64(
          vreinterpretq_u64_u16(vceqq_u16(indices_simd, reset_simd))));
      if (*reinterpret_cast<const uint64_t*>(&any_match)) {
        return true;
      }
#else
#error SIMD 16-bit IsResetUsed not implemented.
#endif  // XE_ARCH
    } while (count >= kSimdVectorU16Elements);
  }
#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar tail (or the entire buffer if no SIMD path is compiled in).
  const uint16_t* source_end = source + count;
  return std::find(source, source_end, reset_index_guest_endian) != source_end;
}

void PrimitiveProcessor::Get16BitResetIndexUsage(
    const uint16_t* source, uint32_t count, uint16_t reset_index_guest_endian,
    bool& is_reset_index_used_out, bool& is_ffff_used_as_vertex_index_out) {
  // Scans `count` 16-bit indices at `source` once and reports two facts:
  // - is_reset_index_used_out: some index equals `reset_index_guest_endian`.
  // - is_ffff_used_as_vertex_index_out: some index equals 0xFFFF while 0xFFFF
  //   is not the reset index itself.
  // The common case is expected to be that the reset index is absent, so the
  // whole buffer is always walked with a branch-light loop body instead of
  // bailing out once both flags are set. 0xFFFF as a real index is likely
  // uncommon as well.
  // TODO(Triang3l): Revisit this - maybe the early-out will be free if this
  // function is bandwidth-bound.
  if (reset_index_guest_endian == UINT16_MAX) {
    // 0xFFFF is the reset index here, so it's never reported as a vertex
    // index - only the presence of the reset index needs to be checked.
    is_ffff_used_as_vertex_index_out = false;
    is_reset_index_used_out =
        IsResetUsed(source, count, reset_index_guest_endian);
    return;
  }
  is_ffff_used_as_vertex_index_out = false;
  is_reset_index_used_out = false;
  // Scalar classification of one index, shared by the prologue and the tail.
  auto classify_index = [&](uint16_t index) {
    if (index == reset_index_guest_endian) {
      is_reset_index_used_out = true;
    }
    if (index == UINT16_MAX) {
      is_ffff_used_as_vertex_index_out = true;
    }
  };
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar prologue until the source pointer is aligned for vector loads.
  for (; count && (reinterpret_cast<uintptr_t>(source) &
                   (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1));
       --count) {
    classify_index(*(source++));
  }
  if (count >= kSimdVectorU16Elements) {
    SimdVectorU16 reset_simd = ReplicateU16(reset_index_guest_endian);
    SimdVectorU16 all_ones_simd = ReplicateU16(UINT16_MAX);
    // Per-lane accumulators, reduced to booleans after the loop.
    SimdVectorU16 reset_accumulator = ReplicateU16(0);
    SimdVectorU16 ffff_accumulator = ReplicateU16(0);
    do {
      SimdVectorU16 indices_simd = LoadAlignedVectorU16(source);
      source += kSimdVectorU16Elements;
      count -= kSimdVectorU16Elements;
#if XE_ARCH_AMD64
      reset_accumulator = _mm_or_si128(
          reset_accumulator, _mm_cmpeq_epi16(indices_simd, reset_simd));
      ffff_accumulator = _mm_or_si128(
          ffff_accumulator, _mm_cmpeq_epi16(indices_simd, all_ones_simd));
#elif XE_ARCH_ARM64
      reset_accumulator =
          vorrq_u16(reset_accumulator, vceqq_u16(indices_simd, reset_simd));
      // Tracking the per-lane maximum - it's compared to 0xFFFF only once,
      // after the loop.
      ffff_accumulator = vmaxq_u16(ffff_accumulator, indices_simd);
#else
#error SIMD Get16BitResetIndexUsage not implemented.
#endif  // XE_ARCH
    } while (count >= kSimdVectorU16Elements);
#if XE_ARCH_AMD64
    if (_mm_movemask_epi8(reset_accumulator)) {
      is_reset_index_used_out = true;
    }
    if (_mm_movemask_epi8(ffff_accumulator)) {
      is_ffff_used_as_vertex_index_out = true;
    }
#elif XE_ARCH_ARM64
    // Saturating narrowing of the two 64-bit halves to 32 bits each preserves
    // "non-zero", allowing the whole vector to be tested as one 64-bit scalar.
    uint64x1_t reset_any = vreinterpret_u64_u32(
        vqmovn_u64(vreinterpretq_u64_u16(reset_accumulator)));
    if (*reinterpret_cast<const uint64_t*>(&reset_any)) {
      is_reset_index_used_out = true;
    }
    uint64x1_t ffff_any = vreinterpret_u64_u32(vqmovn_u64(
        vreinterpretq_u64_u16(vceqq_u16(ffff_accumulator, all_ones_simd))));
    if (*reinterpret_cast<const uint64_t*>(&ffff_any)) {
      is_ffff_used_as_vertex_index_out = true;
    }
#else
#error SIMD Get16BitResetIndexUsage not implemented.
#endif  // XE_ARCH
  }
#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar tail (or the entire buffer if no SIMD path is compiled in).
  for (; count; --count) {
    classify_index(*(source++));
  }
}

bool PrimitiveProcessor::IsResetUsed(const uint32_t* source, uint32_t count,
                                     uint32_t reset_index_guest_endian,
                                     uint32_t low_bits_mask_guest_endian) {
  // Returns true if at least one of the `count` 32-bit indices at `source`,
  // after being ANDed with `low_bits_mask_guest_endian`, is equal to
  // `reset_index_guest_endian`. The indices are read as stored (no byte
  // swapping is done here), and the reset index itself is not masked, so a
  // match is only possible if it has no bits outside the mask.
  // The Xbox 360's GPU only uses the low 24 bits of the index - masking before
  // comparing.
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar prologue - process single indices until the source pointer is
  // aligned to the SIMD vector size, as the vector loop uses aligned loads.
  while (count && (reinterpret_cast<uintptr_t>(source) &
                   (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1))) {
    --count;
    if ((*(source++) & low_bits_mask_guest_endian) ==
        reset_index_guest_endian) {
      return true;
    }
  }
  if (count >= kSimdVectorU32Elements) {
    // Both replicated constants are loop-invariant - building them once
    // outside the vector loop.
    SimdVectorU32 reset_index_guest_endian_simd =
        ReplicateU32(reset_index_guest_endian);
    SimdVectorU32 low_bits_mask_guest_endian_simd =
        ReplicateU32(low_bits_mask_guest_endian);
    while (count >= kSimdVectorU32Elements) {
      count -= kSimdVectorU32Elements;
      SimdVectorU32 source_simd = LoadAlignedVectorU32(source);
      source += kSimdVectorU32Elements;
#if XE_ARCH_AMD64
      source_simd = _mm_and_si128(source_simd, low_bits_mask_guest_endian_simd);
      // Any set bit in the byte mask means that some lane matched.
      if (_mm_movemask_epi8(
              _mm_cmpeq_epi32(source_simd, reset_index_guest_endian_simd))) {
        return true;
      }
#elif XE_ARCH_ARM64
      source_simd = vandq_u32(source_simd, low_bits_mask_guest_endian_simd);
      // Saturating narrowing of the two 64-bit halves of the comparison result
      // to 32 bits each keeps non-zero halves non-zero, so the whole vector
      // can be tested as a single 64-bit scalar.
      uint64x1_t is_any = vreinterpret_u64_u32(vqmovn_u64(vreinterpretq_u64_u32(
          vceqq_u32(source_simd, reset_index_guest_endian_simd))));
      if (*reinterpret_cast<const uint64_t*>(&is_any)) {
        return true;
      }
#else
#error SIMD 32-bit IsResetUsed not implemented.
#endif  // XE_ARCH
    }
  }
#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar tail (or the entire buffer if no SIMD path is compiled in).
  while (count--) {
    if ((*(source++) & low_bits_mask_guest_endian) ==
        reset_index_guest_endian) {
      return true;
    }
  }
  return false;
}

void PrimitiveProcessor::ReplaceResetIndex16To16(
    uint16_t* dest, const uint16_t* source, uint32_t count,
    uint16_t reset_index_guest_endian) {
  // Copies `count` 16-bit indices from `source` to `dest`, writing 0xFFFF in
  // place of every index equal to `reset_index_guest_endian` and leaving all
  // other indices untouched.
  // Scalar copy of one index, shared by the prologue and the tail.
  auto copy_one = [&]() {
    uint16_t index = *(source++);
    if (index == reset_index_guest_endian) {
      index = UINT16_MAX;
    }
    *(dest++) = index;
  };
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar prologue until the source pointer is aligned for vector loads (the
  // destination is written with unaligned stores).
  for (; count && (reinterpret_cast<uintptr_t>(source) &
                   (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1));
       --count) {
    copy_one();
  }
  if (count >= kSimdVectorU16Elements) {
    SimdVectorU16 reset_simd = ReplicateU16(reset_index_guest_endian);
    do {
      SimdVectorU16 indices_simd = LoadAlignedVectorU16(source);
      source += kSimdVectorU16Elements;
      count -= kSimdVectorU16Elements;
      // The lane-wise comparison yields 0 or 0xFFFF, and 0xFFFF is exactly
      // what reset indices must become, thus the output is simply
      // `index | (index == reset_index)`.
      SimdVectorU16 replaced_simd;
#if XE_ARCH_AMD64
      replaced_simd = _mm_or_si128(indices_simd,
                                   _mm_cmpeq_epi16(indices_simd, reset_simd));
#elif XE_ARCH_ARM64
      replaced_simd =
          vorrq_u16(indices_simd, vceqq_u16(indices_simd, reset_simd));
#else
#error SIMD ReplaceResetIndex16To16 not implemented.
#endif  // XE_ARCH
      StoreUnalignedVectorU16(dest, replaced_simd);
      dest += kSimdVectorU16Elements;
    } while (count >= kSimdVectorU16Elements);
  }
#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar tail (or the entire buffer if no SIMD path is compiled in).
  for (; count; --count) {
    copy_one();
  }
}

void PrimitiveProcessor::ReplaceResetIndex16To24(
    uint32_t* dest, const uint16_t* source, uint32_t count,
    uint16_t reset_index_guest_endian) {
  // Widens `count` 16-bit indices from `source` into 32-bit indices in `dest`:
  // indices equal to `reset_index_guest_endian` become 0xFFFFFFFF, all others
  // are zero-extended (so a real 0xFFFF index is written as 0x0000FFFF).
  // Scalar conversion of one index, shared by the prologue and the tail.
  auto widen_one = [&]() {
    uint16_t index = *(source++);
    if (index == reset_index_guest_endian) {
      *(dest++) = UINT32_MAX;
    } else {
      *(dest++) = index;
    }
  };
#if XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar prologue until the source pointer is aligned for vector loads.
  for (; count && (reinterpret_cast<uintptr_t>(source) &
                   (XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE - 1));
       --count) {
    widen_one();
  }
  if (count >= kSimdVectorU16Elements) {
    SimdVectorU16 reset_simd = ReplicateU16(reset_index_guest_endian);
    do {
      SimdVectorU16 indices_simd = LoadAlignedVectorU16(source);
      source += kSimdVectorU16Elements;
      count -= kSimdVectorU16Elements;
      // Per vector:
      // - Compare each lane to the reset index, producing 0 or 0xFFFF.
      // - OR the comparison result into the index, so that reset lanes have
      //   0xFFFF in the lower 16 bits.
      // - Interleave the indices (lower halves) with the comparison result
      //   (upper halves), giving 0xFFFFFFFF for reset indices and 0x0000####
      //   for everything else, including 0x0000FFFF for a real 0xFFFF index
      //   when the reset index is different.
      // - Write the widened indices to the destination.
#if XE_ARCH_AMD64
      __m128i reset_lanes = _mm_cmpeq_epi16(indices_simd, reset_simd);
      __m128i low_halves = _mm_or_si128(indices_simd, reset_lanes);
      StoreUnalignedVectorU32(dest,
                              _mm_unpacklo_epi16(low_halves, reset_lanes));
      // kSimdVectorU16Elements / 2 is expected to end up in the immediate
      // offset part of the store address.
      StoreUnalignedVectorU32(dest + kSimdVectorU16Elements / 2,
                              _mm_unpackhi_epi16(low_halves, reset_lanes));
#elif XE_ARCH_ARM64
      // st2 does the interleaving of the indices and 0 / 0xFFFF.
      uint16x8x2_t interleaved;
      interleaved.val[1] = vceqq_u16(indices_simd, reset_simd);
      interleaved.val[0] = vorrq_u16(indices_simd, interleaved.val[1]);
      vst2q_u16(reinterpret_cast<uint16_t*>(dest), interleaved);
#else
#error SIMD ReplaceResetIndex16To24 not implemented.
#endif  // XE_ARCH
      dest += kSimdVectorU16Elements;
    } while (count >= kSimdVectorU16Elements);
  }
#endif  // XE_GPU_PRIMITIVE_PROCESSOR_SIMD_SIZE
  // Scalar tail (or the entire buffer if no SIMD path is compiled in).
  for (; count; --count) {
    widen_one();
  }
}

// Explicit instantiations of ReplaceResetIndex32To24 for each xenos::Endian
// value it's invoked with in this file (kNone, k8in16, k8in32, k16in32). The
// template definition itself is not in this part of the file - presumably it's
// in the header; verify when changing the signature.
template void PrimitiveProcessor::ReplaceResetIndex32To24<xenos::Endian::kNone>(
    uint32_t* dest, const uint32_t* source, uint32_t count,
    uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian);
template void
PrimitiveProcessor::ReplaceResetIndex32To24<xenos::Endian::k8in16>(
    uint32_t* dest, const uint32_t* source, uint32_t count,
    uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian);
template void
PrimitiveProcessor::ReplaceResetIndex32To24<xenos::Endian::k8in32>(
    uint32_t* dest, const uint32_t* source, uint32_t count,
    uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian);
template void
PrimitiveProcessor::ReplaceResetIndex32To24<xenos::Endian::k16in32>(
    uint32_t* dest, const uint32_t* source, uint32_t count,
    uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian);

// Explicitly instantiates the primitive converter `ConverterName` for 32-bit
// indices with each of the four To24* index transforms (non-swapping, 8-in-16,
// 8-in-32 and 16-in-32 swapping), but not with PassthroughIndexTransform.
#define XE_GPU_PRIMITIVE_PROCESSOR_INSTANTIATE_CONVERSION_NO_PASSTHROUGH(  \
    ConverterName)                                                         \
  template void PrimitiveProcessor::ConverterName(                         \
      uint32_t* dest, const uint32_t* source, uint32_t source_index_count, \
      const To24NonSwappingIndexTransform& index_transform);               \
  template void PrimitiveProcessor::ConverterName(                         \
      uint32_t* dest, const uint32_t* source, uint32_t source_index_count, \
      const To24Swapping8In16IndexTransform& index_transform);             \
  template void PrimitiveProcessor::ConverterName(                         \
      uint32_t* dest, const uint32_t* source, uint32_t source_index_count, \
      const To24Swapping8In32IndexTransform& index_transform);             \
  template void PrimitiveProcessor::ConverterName(                         \
      uint32_t* dest, const uint32_t* source, uint32_t source_index_count, \
      const To24Swapping16In32IndexTransform& index_transform);
// Explicitly instantiates `ConverterName` with PassthroughIndexTransform for
// both 16-bit and 32-bit indices, plus everything instantiated by the
// NO_PASSTHROUGH macro.
#define XE_GPU_PRIMITIVE_PROCESSOR_INSTANTIATE_CONVERSION(ConverterName)   \
  template void PrimitiveProcessor::ConverterName(                         \
      uint16_t* dest, const uint16_t* source, uint32_t source_index_count, \
      const PassthroughIndexTransform& index_transform);                   \
  template void PrimitiveProcessor::ConverterName(                         \
      uint32_t* dest, const uint32_t* source, uint32_t source_index_count, \
      const PassthroughIndexTransform& index_transform);                   \
  XE_GPU_PRIMITIVE_PROCESSOR_INSTANTIATE_CONVERSION_NO_PASSTHROUGH(        \
      ConverterName)
XE_GPU_PRIMITIVE_PROCESSOR_INSTANTIATE_CONVERSION(TriangleFanToList)
// The passthrough versions of LineLoopToStrip are not instantiated from the
// template - they're written out as regular memcpy-based overloads in this
// file.
XE_GPU_PRIMITIVE_PROCESSOR_INSTANTIATE_CONVERSION_NO_PASSTHROUGH(
    LineLoopToStrip)
// TODO(Triang3l): SIMD quad conversion maybe - 2 vectors to 3 vectors (though
// multiple quads are rarely drawn anyway).
XE_GPU_PRIMITIVE_PROCESSOR_INSTANTIATE_CONVERSION(QuadListToTriangleList)
// The helper macros are only needed for the instantiations above.
#undef XE_GPU_PRIMITIVE_PROCESSOR_INSTANTIATE_CONVERSION_NO_PASSTHROUGH
#undef XE_GPU_PRIMITIVE_PROCESSOR_INSTANTIATE_CONVERSION

void PrimitiveProcessor::LineLoopToStrip(
    uint16_t* dest, const uint16_t* source, uint32_t source_index_count,
    const PassthroughIndexTransform& index_transform) {
  // Passthrough conversion of a 16-bit line loop to a line strip: the guest
  // indices are copied as is, and the first index is appended once more to
  // close the loop, so `source_index_count + 1` indices are written.
  // `index_transform` is unused because no transformation is needed.
  // Inputs with at most one index produce no output, to match
  // GetLineLoopStripIndexCount.
  if (source_index_count > 1) {
    std::memcpy(dest, source, sizeof(uint16_t) * source_index_count);
    dest[source_index_count] = *source;
  }
}
// Passthrough (untransformed) line loop to line strip conversion for 32-bit
// indices. Copies the source_index_count source indices to dest as is and
// appends the first index once more to close the loop, so dest must have space
// for source_index_count + 1 indices. index_transform is not used.
void PrimitiveProcessor::LineLoopToStrip(
    uint32_t* dest, const uint32_t* source, uint32_t source_index_count,
    const PassthroughIndexTransform& index_transform) {
  // Nothing is written for 0 or 1 indices, to match
  // GetLineLoopStripIndexCount.
  if (source_index_count >= 2) {
    std::memcpy(dest, source, sizeof(uint32_t) * source_index_count);
    // Return to the first vertex of the loop.
    dest[source_index_count] = source[0];
  }
}

// Splits a 16-bit guest index buffer into single primitives separated by
// reset_index_guest_endian (compared against the raw, guest-endian indices).
// For each run of indices between resets (including empty runs),
// single_primitive_guest_to_host_count is called with the guest index count of
// the run, and if it returns a non-zero host index count, a range (guest
// offset, guest count, host count) is appended to ranges_append_out.
// Returns the sum of the host index counts of all the appended ranges.
uint32_t PrimitiveProcessor::GetMultiPrimitiveHostIndexCountAndRanges(
    std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
    const uint16_t* source, uint32_t source_index_count,
    uint16_t reset_index_guest_endian,
    std::deque<SinglePrimitiveRange>& ranges_append_out) {
  uint32_t host_index_total = 0;
  uint32_t cursor = 0;
  bool reset_found;
  do {
    // Advance to the next reset index or to the end of the index buffer.
    const uint32_t primitive_first = cursor;
    while (cursor < source_index_count &&
           source[cursor] != reset_index_guest_endian) {
      ++cursor;
    }
    // Add the range that has just ended if it produces any host indices.
    const uint32_t guest_count = cursor - primitive_first;
    const uint32_t host_count =
        single_primitive_guest_to_host_count(guest_count);
    if (host_count != 0) {
      ranges_append_out.emplace_back(primitive_first, guest_count, host_count);
      host_index_total += host_count;
    }
    // If stopped before the end of the buffer, it was because of a reset index
    // - skip it and proceed to the next primitive.
    reset_found = cursor < source_index_count;
    if (reset_found) {
      ++cursor;
    }
  } while (reset_found);
  return host_index_total;
}

// Splits a 32-bit guest index buffer into single primitives separated by reset
// indices. A raw, guest-endian index is a reset if, after masking with
// low_bits_mask_guest_endian, it's equal to reset_index_guest_endian.
// For each run of indices between resets (including empty runs),
// single_primitive_guest_to_host_count is called with the guest index count of
// the run, and if it returns a non-zero host index count, a range (guest
// offset, guest count, host count) is appended to ranges_append_out.
// Returns the sum of the host index counts of all the appended ranges.
uint32_t PrimitiveProcessor::GetMultiPrimitiveHostIndexCountAndRanges(
    std::function<uint32_t(uint32_t)> single_primitive_guest_to_host_count,
    const uint32_t* source, uint32_t source_index_count,
    uint32_t reset_index_guest_endian, uint32_t low_bits_mask_guest_endian,
    std::deque<SinglePrimitiveRange>& ranges_append_out) {
  uint32_t host_index_total = 0;
  uint32_t cursor = 0;
  bool reset_found;
  do {
    // Advance to the next reset index or to the end of the index buffer.
    const uint32_t primitive_first = cursor;
    while (cursor < source_index_count &&
           (source[cursor] & low_bits_mask_guest_endian) !=
               reset_index_guest_endian) {
      ++cursor;
    }
    // Add the range that has just ended if it produces any host indices.
    const uint32_t guest_count = cursor - primitive_first;
    const uint32_t host_count =
        single_primitive_guest_to_host_count(guest_count);
    if (host_count != 0) {
      ranges_append_out.emplace_back(primitive_first, guest_count, host_count);
      host_index_total += host_count;
    }
    // If stopped before the end of the buffer, it was because of a reset index
    // - skip it and proceed to the next primitive.
    reset_found = cursor < source_index_count;
    if (reset_found) {
      ++cursor;
    }
  } while (reset_found);
  return host_index_total;
}

// Begins a cache transaction for the index buffer range described by key.
// If the key is already in the cache map, the stored result is taken and the
// transaction becomes ResultType::kExisting. Otherwise, the range is recorded
// as currently being processed, and physical memory access callbacks are
// enabled for it so modifications of the source indices can be detected.
PrimitiveProcessor::CacheTransaction::CacheTransaction(
    PrimitiveProcessor& processor, CacheKey key)
    : processor_(processor), key_(key) {
  assert_zero(processor_.cache_currently_processing_size_bytes_);

  // Caching is used only when enabled (non-negative threshold) and when the
  // vertex count reaches the threshold.
  const bool is_cacheable =
      cvars::primitive_processor_cache_min_indices >= 0 &&
      key_.count >= uint32_t(cvars::primitive_processor_cache_min_indices);
  if (!is_cacheable) {
    // NOTE(review): zeroing the whole key is presumably what also clears
    // key_.count for the check below - confirm against the CacheKey layout.
    key_.key = 0;
  }
  if (key_.count == 0) {
    return;
  }

  const size_t index_size_bytes = key_.format == xenos::IndexFormat::kInt16
                                      ? sizeof(uint16_t)
                                      : sizeof(uint32_t);
  const uint32_t size_bytes = uint32_t(index_size_bytes * key_.count);

  {
    auto global_lock = processor_.global_critical_region_.Acquire();
    auto existing_it = processor_.cache_map_.find(key_);
    if (existing_it == processor_.cache_map_.end()) {
      // Inhibit writing the new result if the range happens to be modified
      // during the processing outside the lock.
      processor_.cache_currently_processing_base_ = key_.base;
      processor_.cache_currently_processing_size_bytes_ = size_bytes;
    } else {
      result_ = processor_.cache_entry_pool_[existing_it->second].result;
      result_type_ = ResultType::kExisting;
    }
  }

  if (result_type_ == ResultType::kExisting) {
    // Nothing to process - the cached result is reused.
    return;
  }

  // Enable the invalidation callback before reading the indices.
  // The callback is registered lazily, only once something actually needs
  // processing, so no time is wasted in the access violation handler if the
  // guest doesn't use anything requiring host conversion.
  if (!processor_.memory_invalidation_callback_handle_) {
    processor_.memory_invalidation_callback_handle_ =
        processor_.memory_.RegisterPhysicalMemoryInvalidationCallback(
            MemoryInvalidationCallbackThunk, &processor_);
  }
  processor_.memory_.EnablePhysicalMemoryAccessCallbacks(key_.base, size_bytes,
                                                         true, false);
}

// Ends the cache transaction. Unless the transaction was a no-op (zero count)
// or reused an existing entry, the "currently processing" range is cleared,
// and if a new result was set (ResultType::kNewSet), a cache entry is created
// for it, linked at the head of the list of each bucket it belongs to, and
// added to the cache map.
PrimitiveProcessor::CacheTransaction::~CacheTransaction() {
  if (result_type_ == ResultType::kExisting || key_.count == 0) {
    return;
  }

  auto global_lock = processor_.global_critical_region_.Acquire();

  processor_.cache_currently_processing_size_bytes_ = 0;
  processor_.cache_currently_processing_base_ = 0;

  if (result_type_ != ResultType::kNewSet) {
    // No result to store.
    return;
  }

  // Reuse a free pool entry if there is one, otherwise grow the pool.
  size_t entry_index = processor_.cache_bucket_free_first_entry_;
  if (entry_index == SIZE_MAX) {
    entry_index = processor_.cache_entry_pool_.size();
    processor_.cache_entry_pool_.emplace_back();
  } else {
    processor_.cache_bucket_free_first_entry_ =
        processor_.cache_entry_pool_[entry_index].free_next;
  }
  CacheEntry& entry = processor_.cache_entry_pool_[entry_index];

  // Put the entry in 1 or 2 buckets, as the new head of each bucket's list.
  const uint32_t first_bucket = key_.base >> kCacheBucketSizeBytesLog2;
  const uint32_t link_count = CacheEntry::GetBucketCount(key_);
  for (uint32_t link = 0; link < link_count; ++link) {
    const uint32_t bucket = first_bucket + link;
    const uint64_t l1_bit = uint64_t(1) << (bucket & 63);
    uint64_t& l1_bits = processor_.cache_buckets_non_empty_l1_[bucket >> 6];
    size_t& bucket_head = processor_.cache_bucket_first_entries_[bucket];
    entry.buckets_prev[link] = SIZE_MAX;
    if ((l1_bits & l1_bit) == 0) {
      // The bucket was empty - the new entry is its only one.
      entry.buckets_next[link] = SIZE_MAX;
      l1_bits |= l1_bit;
      processor_.UpdateCacheBucketsNonEmptyL2(bucket >> 6, global_lock);
    } else {
      // There is at least one entry already in the bucket - insert before the
      // current head.
      entry.buckets_next[link] = bucket_head;
      CacheEntry& old_head = processor_.cache_entry_pool_[bucket_head];
      // The old head's link within this bucket is [0] if this bucket is its
      // start bucket. Otherwise, since a cache entry may belong only to at
      // most 2 buckets, this bucket must be its [1] bucket.
      const size_t old_head_link =
          (old_head.key.base >> kCacheBucketSizeBytesLog2) != bucket ? 1 : 0;
      old_head.buckets_prev[old_head_link] = entry_index;
    }
    bucket_head = entry_index;
  }

  entry.key = key_;
  entry.result = result_;

  processor_.cache_map_.emplace(key_, entry_index);
}

// Invalidates the cache entries whose source index ranges overlap the physical
// memory range [physical_address_start, physical_address_start + length).
//
// The range is clamped to SharedMemory::kBufferSize. If exact_range is false,
// the range is expanded to whole cache buckets before the entries are checked.
// Non-empty buckets are located via the two-level bit sets
// (cache_buckets_non_empty_l2_ -> cache_buckets_non_empty_l1_ -> bucket), and
// every overlapping entry is removed from the cache map, unlinked from the
// lists of all buckets it belongs to, and pushed to the free entry list.
//
// Returns (start, length) of the (possibly bucket-aligned) range if any entry
// was invalidated, or (0, UINT32_MAX) if nothing was invalidated or the input
// range was empty or outside the buffer.
std::pair<uint32_t, uint32_t> PrimitiveProcessor::MemoryInvalidationCallback(
    uint32_t physical_address_start, uint32_t length, bool exact_range) {
  if (length == 0 || physical_address_start >= SharedMemory::kBufferSize) {
    return std::make_pair(uint32_t(0), UINT32_MAX);
  }
  length = std::min(length, SharedMemory::kBufferSize - physical_address_start);
  // Exclusive end of the range.
  uint32_t physical_address_end = physical_address_start + length;
  if (!exact_range) {
    // Invalidate entire buckets if this is an access callback rather than
    // something like a file read to disable access violation handling for a
    // bigger range for higher performance.
    physical_address_start &= ~(kCacheBucketSizeBytes - 1);
    physical_address_end =
        xe::align(physical_address_end, kCacheBucketSizeBytes);
  }
  bool any_invalidated = false;
  // Inclusive bucket range, and the indices of the 64-bit words of the L1 and
  // L2 bit sets containing its bounds (one L1 word covers 64 buckets, one L2
  // word covers 64 L1 words).
  uint32_t bucket_index_first =
      physical_address_start >> kCacheBucketSizeBytesLog2;
  uint32_t bucket_index_last =
      (physical_address_end - 1) >> kCacheBucketSizeBytesLog2;
  uint32_t bucket_l1_bits_index_first = bucket_index_first >> 6;
  uint32_t bucket_l1_bits_index_last = bucket_index_last >> 6;
  uint32_t bucket_l2_bits_index_first = bucket_index_first >> 12;
  uint32_t bucket_l2_bits_index_last = bucket_index_last >> 12;
  auto global_lock = global_critical_region_.Acquire();
  for (uint32_t bucket_l2_bits_index = bucket_l2_bits_index_first;
       bucket_l2_bits_index <= bucket_l2_bits_index_last;
       ++bucket_l2_bits_index) {
    // Mask of the not yet visited L2 bits within the range - trimmed at the
    // first and the last L2 words.
    uint64_t bucket_l2_bits_mask = UINT64_MAX;
    if (bucket_l2_bits_index == bucket_l2_bits_index_first) {
      bucket_l2_bits_mask &=
          ~((uint64_t(1) << (bucket_l1_bits_index_first & 63)) - 1);
    }
    if (bucket_l2_bits_index == bucket_l2_bits_index_last &&
        (bucket_l1_bits_index_last & 63) != 63) {
      bucket_l2_bits_mask &=
          (uint64_t(1) << ((bucket_l1_bits_index_last & 63) + 1)) - 1;
    }
    // Not caching L2 bits because they may be modified by unlinking.
    // Loop while any bits in the 64-bit portion of the L2 bit set are left.
    while (bucket_l2_bits_mask) {
      uint32_t bucket_l2_bit_shift;
      if (!xe::bit_scan_forward(
              cache_buckets_non_empty_l2_[bucket_l2_bits_index] &
                  bucket_l2_bits_mask,
              &bucket_l2_bit_shift)) {
        break;
      }
      // Mark the L2 bit as visited.
      bucket_l2_bits_mask &= ~(uint64_t(1) << bucket_l2_bit_shift);
      uint32_t bucket_l1_bits_index =
          (bucket_l2_bits_index << 6) | bucket_l2_bit_shift;
      // Mask of the not yet visited L1 bits (buckets) within the range -
      // trimmed at the first and the last L1 words.
      uint64_t bucket_l1_bits_mask = UINT64_MAX;
      if (bucket_l1_bits_index == bucket_l1_bits_index_first) {
        bucket_l1_bits_mask &=
            ~((uint64_t(1) << (bucket_index_first & 63)) - 1);
      }
      if (bucket_l1_bits_index == bucket_l1_bits_index_last &&
          (bucket_index_last & 63) != 63) {
        bucket_l1_bits_mask &=
            (uint64_t(1) << ((bucket_index_last & 63) + 1)) - 1;
      }
      // Not caching L1 bits because they may be modified by unlinking.
      // Loop over buckets while any bits in the 64-bit portion of the L1 bit
      // set are left.
      while (bucket_l1_bits_mask) {
        uint32_t bucket_l1_bit_shift;
        if (!xe::bit_scan_forward(
                cache_buckets_non_empty_l1_[bucket_l1_bits_index] &
                    bucket_l1_bits_mask,
                &bucket_l1_bit_shift)) {
          break;
        }
        // Mark the bucket as visited.
        bucket_l1_bits_mask &= ~(uint64_t(1) << bucket_l1_bit_shift);
        uint32_t bucket_index =
            (bucket_l1_bits_index << 6) | bucket_l1_bit_shift;
        // Invalidate the entries in the bucket, fully or partially.
        // The L1 bit is set, so the bucket has at least one entry.
        size_t entry_index = cache_bucket_first_entries_[bucket_index];
        do {
          CacheEntry& entry = cache_entry_pool_[entry_index];
          CacheKey entry_key = entry.key;
          // If the start ([0]) bucket of the entry is bucket_index, the link
          // within this bucket is its link [0]. Otherwise, since a cache entry
          // may belong only to at most 2 buckets, bucket_index must be its [1]
          // bucket.
          uint32_t entry_bucket_index_first =
              entry_key.base >> kCacheBucketSizeBytesLog2;
          assert_true((bucket_index - entry_bucket_index_first) <= 1,
                      "Cache entries only store list links within two buckets");
          // Fetch the next entry in this bucket before the current one is
          // possibly unlinked.
          size_t next_entry_index =
              entry.buckets_next[bucket_index - entry_bucket_index_first];
          // For exact_range, don't invalidate bucket entries that are outside
          // the specified range. The entry [base, entry_end) overlaps the
          // range [start, end) only if it begins before the end of the range
          // and ends after the start of the range.
          if (entry_key.base < physical_address_end) {
            uint32_t entry_end = entry_key.base + entry_key.GetSizeBytes();
            if (entry_end > physical_address_start) {
              // Invalidate the entry.
              any_invalidated = true;
              // Remove the entry from the cache map.
              auto entry_map_it = cache_map_.find(entry_key);
              assert_true(entry_map_it != cache_map_.end());
              if (entry_map_it != cache_map_.end()) {
                cache_map_.erase(entry_map_it);
              }
              // Unlink the entry from the lists of all (1 or 2) buckets it
              // belongs to.
              uint32_t entry_link_index_last =
                  ((entry_end - 1) >> kCacheBucketSizeBytesLog2) -
                  entry_bucket_index_first;
              assert_true(
                  entry_link_index_last <= 1,
                  "Cache entries only store list links within two buckets");
              for (uint32_t entry_link_index = 0;
                   entry_link_index <= entry_link_index_last;
                   ++entry_link_index) {
                uint32_t entry_bucket_index =
                    entry_bucket_index_first + entry_link_index;
                size_t entry_link_prev = entry.buckets_prev[entry_link_index];
                size_t entry_link_next = entry.buckets_next[entry_link_index];
                if (entry_link_prev != SIZE_MAX) {
                  // Not the head - make the previous entry skip this one. The
                  // neighbor's link index for this bucket is [0] if the bucket
                  // is the neighbor's start bucket, [1] otherwise.
                  CacheEntry& entry_prev = cache_entry_pool_[entry_link_prev];
                  entry_prev.buckets_next[size_t(
                      (entry_prev.key.base >> kCacheBucketSizeBytesLog2) !=
                      entry_bucket_index)] = entry_link_next;
                } else {
                  // The entry was the head of the bucket's list.
                  if (entry_link_next != SIZE_MAX) {
                    cache_bucket_first_entries_[entry_bucket_index] =
                        entry_link_next;
                  } else {
                    // The only entry that was remaining in the bucket - it's
                    // empty now.
                    cache_buckets_non_empty_l1_[entry_bucket_index >> 6] &=
                        ~(uint64_t(1) << (entry_bucket_index & 63));
                    UpdateCacheBucketsNonEmptyL2(entry_bucket_index >> 6,
                                                 global_lock);
                  }
                }
                if (entry_link_next != SIZE_MAX) {
                  CacheEntry& entry_next = cache_entry_pool_[entry_link_next];
                  entry_next.buckets_prev[size_t(
                      (entry_next.key.base >> kCacheBucketSizeBytesLog2) !=
                      entry_bucket_index)] = entry_link_prev;
                }
              }
              // Make the entry free for reuse.
              entry.free_next = cache_bucket_free_first_entry_;
              cache_bucket_free_first_entry_ = entry_index;
            }
          }
          entry_index = next_entry_index;
        } while (entry_index != SIZE_MAX);
      }
    }
  }
  return any_invalidated
             ? std::make_pair(physical_address_start,
                              physical_address_end - physical_address_start)
             : std::make_pair(uint32_t(0), UINT32_MAX);
}

std::pair<uint32_t, uint32_t>
PrimitiveProcessor::MemoryInvalidationCallbackThunk(
    void* context_ptr, uint32_t physical_address_start, uint32_t length,
    bool exact_range) {
  return reinterpret_cast<PrimitiveProcessor*>(context_ptr)
      ->MemoryInvalidationCallback(physical_address_start, length, exact_range);
}

}  // namespace gpu
}  // namespace xe
